In [1]:
import os
import pickle

import pandas as pd

# Every relative path used from here on (the pickle read below and the CSV
# files written in the last cell) resolves against ../data/.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel changes directory a second time and the paths stop resolving.
os.chdir('../data/')

# How the pickle was written originally (kept for provenance):
#pickle.dump( results , open( "reditt_autism.p", "wb" ) )

# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced locally by a trusted run.
# The context manager closes the file handle as soon as loading finishes.
with open("reditt_autism.p", "rb") as pickle_file:
    results = pickle.load(pickle_file)
In [2]:
# Sanity check: number of entries loaded from the pickle.
len(results)
Out[2]:
In [3]:
# Posts table: starts empty, one row is appended per kept post by add_post().
post_fields = ['post id', 'title', 'text', 'href', 'user id']
df = pd.DataFrame(columns=post_fields)

# Users table: also starts empty; its index is labelled "user id" and is the
# value stored in the posts table's "user id" column.
df_users = pd.DataFrame(columns=['user description']).rename_axis("user id")
In [4]:
def add_post(post_id, title, text, url, user_name):
    """Append one post to ``df`` and register its author in ``df_users``.

    Both tables are module-level DataFrames that are modified in place.

    Parameters
    ----------
    post_id : value stored in the "post id" column.
    title : value stored in the "title" column.
    text : value stored in the "text" column.
    url : value stored in the "href" column.
    user_name : author name. It is looked up in the "user description"
        column of ``df_users``; if absent, a new row is added whose index
        label is the current number of users. A missing author (None/NaN)
        is matched with ``isna()`` so that all such posts share a single
        user row instead of raising ``IndexError``.

    Returns
    -------
    None
    """
    global df, df_users

    # Find the author with a single scan of the users table. An equality
    # comparison never matches None/NaN, hence the separate isna() branch.
    known_names = df_users['user description']
    if pd.isna(user_name):
        matches = known_names.isna()
    else:
        matches = known_names == user_name

    if matches.any():
        # Existing author: reuse the index label of the first matching row.
        user_id = df_users.index[matches.to_numpy()][0]
    else:
        # New author: the next positional label becomes the user id.
        user_id = len(df_users)
        df_users.loc[user_id] = {"user description": user_name}

    # Append the post; len(df) gives the next row label without building
    # the full values array just to count rows.
    df.loc[len(df)] = {
        "post id": post_id,
        "title": title,
        "text": text,
        "href": url,
        "user id": user_id,
    }
In [5]:
# Walk every loaded entry. Entries whose self-text is longer than 3 characters
# are stored through add_post() with consecutive ids starting at 1; the
# stripped text of every other entry is collected so it can be inspected.
post_id = 0
skipped_texts = []
for entry in results:
    link, heading, body, author = (
        entry['url'],
        entry['title'],
        entry['selftext'],
        entry['author'],
    )
    if len(body) <= 3:
        skipped_texts.append(body.strip())
        continue
    post_id += 1
    add_post(post_id, heading, body, link, author)

# Concatenation of everything that was skipped (shown as the cell output).
not_shown = "".join(skipped_texts)
not_shown
Out[5]:
In [6]:
# Preview the first two rows of the posts table.
df.head(2)
Out[6]:
In [7]:
# Preview the first two rows of the users table.
df_users.head(2)
Out[7]:
In [8]:
# Report how many posts were kept, then write both tables to CSV files in the
# current working directory.
print(df.shape[0])

# Posts: the positional row index is not written.
df.to_csv('reditt-posts.csv', index=False)

# Users: the index is written too, since it holds the ids that the posts
# table's "user id" column refers to.
df_users.to_csv('reditt-users.csv', index=True)